package com.hui.crawler;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Base {@link CrawlerFactory} implementation that fetches a page over HTTP
 * and extracts anchor links from the response body.
 *
 * @version 0.0.1
 * @since 2017/5/27
 */
@Slf4j
@Slf4j
public class AbstractCrawlerFactory implements CrawlerFactory {

    /** Page fetched by {@link #getHttpClient()}. */
    private static final String PAGE_URL = "https://www.624q.com/Html/93/";

    /** Connect/read timeout in milliseconds. */
    private static final int TIMEOUT_MILLIS = 2000;

    /**
     * Matches anchor tags; group 1 captures the href value.
     * Compiled once — Pattern is immutable and thread-safe.
     */
    private static final Pattern LINK_PATTERN =
            Pattern.compile("<a.*?href=[\"']?((https?://)?/?[^\"']+)[\"']?.*?>(.+)</a>");

    /**
     * Fetches {@value #PAGE_URL} with a plain {@link HttpURLConnection} and,
     * on HTTP 200, extracts and logs every anchor link found in the body.
     *
     * <p>The response stream is closed via try-with-resources and the
     * connection is always released in {@code finally} — the original code
     * leaked both.
     *
     * @throws IOException if the connection cannot be opened or read
     */
    @Override
    public void getHttpClient() throws IOException {
        log.info("getHttpClient");
        URL url = new URL(PAGE_URL);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        try {
            connection.setRequestMethod("GET");
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                // try-with-resources guarantees the stream is closed even if
                // reading or matching throws.
                try (InputStream inputStream = connection.getInputStream()) {
                    // Explicit charset — the single-arg IOUtils.toString
                    // overload is deprecated and used the platform default.
                    String html = IOUtils.toString(inputStream, StandardCharsets.UTF_8);
                    Matcher matcher = LINK_PATTERN.matcher(html);
                    // Iterate ALL matches; the original looked at only the
                    // first and silently discarded it.
                    while (matcher.find()) {
                        String newLink = matcher.group(1).trim(); // 链接 (link)
                        log.info("found link: {}", newLink);
                    }
                }
            }
        } finally {
            connection.disconnect();
        }
    }
}
